1 logistics

Remember to change the "root" path below to a directory under your own home directory!

# Load required packages, installing any that are missing first.
# NOTE: require() (not library()) is used in the sapply() so the result is a
# named logical vector showing which packages attached successfully.
pkgs <- c("ape", "phangorn", "seqinr", "phytools",
          "plotly", "dplyr", "tidyr", "Matrix",
          "geosphere", "leaflet")
pkgs_ui <- setdiff(pkgs, rownames(installed.packages()))  # packages not yet installed
if (length(pkgs_ui) > 0) install.packages(pkgs_ui, verbose = FALSE)
sapply(pkgs, require, character.only = TRUE)

# set paths
# NOTE(review): change `root` to a directory under your own home directory
root <- "~/projects/ai4all-sfu_bio"

seq_dir   <- paste0(root, "/data/FASTA.fa")     # original genetic sequences
meta_dir  <- paste0(root, "/data/meta.csv")     # strain metadata (date, location, ...)
align_dir <- paste0(root, "/data/alignment.fa") # aligned sequences (produced by mafft)

# results root; showWarnings = FALSE silences the warning if it already exists
result_dir_ <- paste0(root, "/result")
dir.create(result_dir_, showWarnings = FALSE)

# use a previous result; set to "" to start anew (re-sample and recompute)
result_time_ <- "2018-07-09_20-09-04"

# options
nseq <- 200 # number of sequences to sample; requires 2*nseq < total number of sequences
##       ape  phangorn    seqinr  phytools    plotly     dplyr     tidyr 
##      TRUE      TRUE      TRUE      TRUE      TRUE      TRUE      TRUE 
##    Matrix geosphere   leaflet 
##      TRUE      TRUE      TRUE

1.1 align influenza virus genetic sequences

# load sequence alignments; to align sequences, install mafft @ (https://mafft.cbrc.jp/alignment/software/)
# the alignment is cached on disk: mafft only runs when align_dir is absent
if (!file.exists(align_dir)) system(paste0("mafft ", seq_dir, " > ", align_dir))
align <- read.dna(align_dir, format = "fasta")
# FASTA headers look like "<accession>-<YYYY/MM/DD>"; split on "-" to recover each part
strains_id <- rownames(align)
strains <- sapply(strsplit(strains_id, "[-]"), function(x) x[1])
dates <- as.Date(sapply(strsplit(strains_id, "[-]"), function(x) x[2]))

1.2 load/subset results

# sample for more recent strains
set.seed(17)
# fall back to a fresh run if the requested previous result folder is missing
if (result_time_ != "" && !dir.exists(paste0(result_dir_, "/", result_time_))) {
  result_time_ <- ""
}
# timestamp for this run, e.g. "2018-07-09_20-09-04" (":" and " " are not filename-safe)
result_time <- if (result_time_ == "") {
  gsub(" ", "_", gsub("[:]", "-", Sys.time()))
} else {
  result_time_
}
result_dir <- paste0(result_dir_, "/", result_time)
dir.create(result_dir, showWarnings = FALSE)
ind_dir <- paste0(result_dir, "/ind.csv")

if (result_time_ == "") {
  # sample nseq strain IDs from the first min(2*nseq, n) alignment rows, then cache
  sam_ind <- rownames(align)[sample(seq_len(min(2 * nseq, nrow(align))), nseq, replace = FALSE)]
  write.csv(sam_ind, file = ind_dir, row.names = FALSE)
}
# always re-read from the cache so fresh and resumed runs take the same code path
sam_ind <- read.csv(ind_dir, stringsAsFactors = FALSE)[, 1]
sam_ind
##   [1] "CY234578-2017/04/17" "CY222989-2017/02/14" "CY228725-2017/03/19"
##   [4] "CY224126-2017/02/26" "CY228541-2017/03/23" "CY227534-2017/03/13"
##   [7] "CY232562-2017/04/11" "CY232547-2017/04/13" "CY228301-2017/02/26"
##  [10] "CY230826-2017/04/13" "CY228749-2017/03/21" "CY238923-2017/06/16"
##  [13] "CY227254-2017/02/23" "CY227318-2017/02/23" "CY221910-2017/02/16"
##  [16] "CY223149-2017/02/17" "CY228893-2017/03/09" "CY230730-2017/04/04"
##  [19] "CY227670-2017/03/08" "CY227814-2017/03/15" "CY223277-2017/02/22"
##  [22] "CY224062-2017/02/14" "CY228461-2017/03/02" "CY223085-2017/02/27"
##  [25] "CY223261-2017/02/22" "CY234810-2017/05/01" "CY231099-2017/03/13"
##  [28] "CY223926-2017/02/15" "CY223157-2017/02/20" "CY223021-2017/02/13"
##  [31] "CY230690-2017/03/20" "CY235920-2017/05/14" "CY228373-2017/02/27"
##  [34] "CY235864-2017/03/29" "CY227630-2017/03/14" "CY236035-2017/04/08"
##  [37] "CY225358-2017/03/06" "CY230229-2017/03/07" "CY231125-2017/03/06"
##  [40] "CY223069-2017/02/28" "CY225246-2017/03/05" "CY230594-2017/03/10"
##  [43] "CY234722-2017/04/09" "CY227798-2017/03/16" "CY230890-2017/04/10"
##  [46] "CY235744-2017/04/07" "CY227278-2017/02/22" "MF182513-2017/02/15"
##  [49] "CY230882-2017/04/05" "CY227878-2017/03/16" "CY223133-2017/02/26"
##  [52] "CY234842-2017/04/27" "CY227806-2017/03/17" "CY230770-2017/04/11"
##  [55] "CY238354-2017/03/14" "KY949821-2017/02/14" "CY225406-2017/03/12"
##  [58] "CY234698-2017/04/17" "CY222885-2017/02/16" "CY235824-2017/05/08"
##  [61] "CY235752-2017/04/28" "CY234192-2017/03/13" "CY232618-2017/04/21"
##  [64] "CY235680-2017/04/25" "CY234498-2017/04/17" "CY232372-2017/03/16"
##  [67] "KY949985-2017/02/13" "CY232594-2017/04/02" "CY238851-2017/05/08"
##  [70] "CY228685-2017/03/15" "CY232578-2017/04/03" "CY225238-2017/03/06"
##  [73] "CY234474-2017/04/15" "CY238298-2017/03/06" "CY235624-2017/04/17"
##  [76] "MF182482-2017/03/22" "CY227902-2017/03/21" "CY235792-2017/05/07"
##  [79] "CY232492-2017/04/10" "CY227590-2017/03/06" "KY950048-2017/02/14"
##  [82] "CY238907-2017/06/07" "CY238569-2017/03/31" "CY230277-2017/03/10"
##  [85] "CY232310-2017/03/29" "CY223878-2017/02/13" "CY227662-2017/03/02"
##  [88] "CY230834-2017/03/20" "CY231012-2017/02/20" "CY227870-2017/02/22"
##  [91] "CY227414-2017/02/23" "MF182524-2017/03/10" "CY234272-2017/02/27"
##  [94] "CY232364-2017/03/30" "CY228933-2017/03/30" "CY228589-2017/03/19"
##  [97] "CY221942-2017/02/15" "CY227638-2017/03/14" "CY235712-2017/04/25"
## [100] "CY230021-2017/02/18" "CY224086-2017/02/26" "CY227174-2017/02/20"
## [103] "CY230101-2017/02/26" "CY230069-2017/03/07" "CY223213-2017/02/22"
## [106] "CY227454-2017/03/08" "CY228549-2017/03/28" "CY232356-2017/04/04"
## [109] "CY234562-2017/04/19" "CY235848-2017/04/18" "CY225230-2017/03/06"
## [112] "CY230842-2017/03/31" "CY226998-2017/02/22" "CY235400-2017/04/27"
## [115] "CY230786-2017/04/03" "CY227262-2017/02/21" "CY230117-2017/03/09"
## [118] "CY227302-2017/02/27" "CY225334-2017/03/03" "CY238931-2017/05/22"
## [121] "CY224110-2017/02/27" "MF182461-2017/03/23" "CY234858-2017/04/19"
## [124] "CY232380-2017/03/20" "CY238362-2017/03/21" "CY230285-2017/02/20"
## [127] "CY234690-2017/04/24" "CY223189-2017/02/28" "CY228581-2017/03/23"
## [130] "CY228797-2017/03/20" "CY228845-2017/03/26" "CY233370-2017/03/13"
## [133] "CY227038-2017/02/16" "CY235912-2017/05/22" "CY228757-2017/03/13"
## [136] "CY232468-2017/03/31" "CY231077-2017/03/13" "CY223061-2017/02/27"
## [139] "CY223237-2017/02/25" "CY224094-2017/02/23" "CY234408-2017/03/20"
## [142] "CY230666-2017/03/20" "CY227846-2017/03/02" "CY227742-2017/02/25"
## [145] "CY235976-2017/05/15" "CY235968-2017/05/12" "MF182504-2017/02/22"
## [148] "CY228533-2017/03/20" "CY234522-2017/03/28" "CY233432-2017/03/22"
## [151] "CY232524-2017/04/08" "CY227294-2017/02/26" "CY238915-2017/06/13"
## [154] "CY227326-2017/03/01" "CY227838-2017/03/25" "CY230301-2017/02/27"
## [157] "CY224070-2017/02/22" "CY227830-2017/03/08" "CY231091-2017/03/09"
## [160] "CY225270-2017/02/27" "CY238875-2017/05/30" "CY230530-2017/02/22"
## [163] "CY230357-2017/03/21" "CY227718-2017/03/13" "MF182532-2017/02/21"
## [166] "CY223317-2017/02/27" "CY227206-2017/02/14" "CY227622-2017/03/07"
## [169] "CY234746-2017/05/09" "CY234514-2017/04/05" "CY234482-2017/03/24"
## [172] "CY234874-2017/04/29" "CY233259-2017/03/20" "CY235472-2017/03/04"
## [175] "CY234530-2017/03/30" "CY233448-2017/03/27" "CY234602-2017/04/11"
## [178] "CY238561-2017/03/29" "CY232540-2017/03/31" "CY234834-2017/04/30"
## [181] "CY232340-2017/03/23" "CY231118-2017/03/09" "CY224118-2017/03/06"
## [184] "CY230602-2017/03/14" "CY223125-2017/02/22" "CY225422-2017/03/13"
## [187] "CY227006-2017/02/22" "CY235568-2017/03/27" "CY232610-2017/04/16"
## [190] "CY230714-2017/03/06" "CY227686-2017/03/05" "CY222557-2017/02/23"
## [193] "CY234818-2017/05/08" "CY227446-2017/03/02" "CY234506-2017/04/16"
## [196] "CY232332-2017/03/30" "CY232324-2017/03/30" "KY950005-2017/02/14"
## [199] "CY238662-2017/04/17" "CY223053-2017/02/24"

2 infer phylogeny (ancestral tree)

2.1 UPGMA

calculate distance between sequences

align_n <- align[sam_ind, ]             # subset the alignment to the sampled strains
align_n <- as.phyDat(align_n)           # convert DNAbin -> phyDat (phangorn's format)
dm <- dist.ml(align_n, model = "JC69")  # ML pairwise distances under Jukes-Cantor

infer the tree using upgma

# cluster the distance matrix with UPGMA (assumes a constant molecular clock)
tree_UPGMA <- upgma(dm)
plot(tree_UPGMA, show.tip.label = FALSE, main = "UPGMA")

2.2 neighbour joining (nj)

infer the tree using nj

# mt = modelTest(align_n)  # (optional) compare substitution models; slow
# print(mt)
tree_NJ <- NJ(dm)  # neighbour-joining tree from the same distance matrix
plot(tree_NJ, main = "Phylogeny: Neighbour Joining", show.tip.label = FALSE)

tree <- ladderize(tree_NJ)  # rotate nodes for a cleaner "ladder" layout
plot(tree, main = "Phylogeny: Neighbour Joining", show.tip.label = FALSE)

is.rooted(tree_NJ) # is the tree rooted? (NJ trees are unrooted, hence FALSE below)
is.binary(tree_NJ) # are all internal nodes bifurcating? (TRUE below)
## [1] FALSE
## [1] TRUE

But which of these trees is a better fit for your data? Using the parsimony() function, you can compare their respective parsimony scores.

parsimony(tree_UPGMA, align_n) # parsimony score: lower = fewer implied mutations
parsimony(tree_NJ, align_n)    # NJ scores lower here (561 vs 598), i.e. a better fit
## [1] 598
## [1] 561

optim.parsimony tries to find the maximum parsimony tree using either Nearest Neighbor Interchange (NNI) rearrangements or subtree pruning and regrafting (SPR).

# search for a maximum-parsimony tree: NNI rearrangements starting from the NJ tree
tree_optim <- optim.parsimony(tree_NJ, align_n)
# parsimony ratchet: a more thorough stochastic search
tree_pratchet <- pratchet(align_n)
plot(tree_optim, main = "Phylogeny: Maximum Parsimony NNI", show.tip.label = FALSE)

plot(tree_pratchet, main = "Phylogeny: Maximum Parsimony SPR", show.tip.label = FALSE)

## Final p-score 559 after  2 nni operations 
## [1] "Best pscore so far: 559"
## [1] "Best pscore so far: 559"
## [1] "Best pscore so far: 559"
## [1] "Best pscore so far: 559"
## [1] "Best pscore so far: 559"
## [1] "Best pscore so far: 559"
## [1] "Best pscore so far: 559"
## [1] "Best pscore so far: 559"
## [1] "Best pscore so far: 559"
## [1] "Best pscore so far: 559"

2.3 maximum likelihood

maximum likelihood methods allow you to include the full data from your sequence alignment in a statistical framework that estimates model parameters.

the first thing we need to do is create a special type of object of class “pml”. This object includes mostly our tree and data, but also parameters of an evolutionary model, their values (usually set to some default), and the likelihood of the model.

# likelihood object combining the NJ tree, the data, and default (JC-like) parameters
fit <- pml(tree_NJ, align_n)
## negative edges length changed to 0!
print(fit)
plot(fit, main = "Phylogeny: Neighbour Joining (PML)", show.tip.label = FALSE)

## 
##  loglikelihood: -7200.111 
## 
## unconstrained loglikelihood: -5500.084 
## 
## Rate matrix:
##   a c g t
## a 0 1 1 1
## c 1 0 1 1
## g 1 1 0 1
## t 1 1 1 0
## 
## Base frequencies:  
## 0.25 0.25 0.25 0.25

function optim.pml() will optimize tree topology and branch length for your selected model of nucleotide evolution.

# optimize branch lengths and topology under the JC model, with stochastic rearrangements
fitJC <- optim.pml(fit, model = "JC", rearrangement = "stochastic")
logLik(fitJC)  # log-likelihood of the optimized model
plot(fitJC, main = "Phylogeny: Neighbour Joining Optimized (PML)", show.tip.label = FALSE)

## optimize edge weights:  -7200.111 --> -7175.895 
## optimize edge weights:  -7175.895 --> -7175.895 
## optimize topology:  -7175.895 --> -7165.78 
## optimize topology:  -7165.78 --> -7165.78 
## optimize topology:  -7165.78 --> -7164.921 
## 4 
## optimize edge weights:  -7164.921 --> -7164.921 
## optimize topology:  -7164.921 --> -7164.921 
## 0 
## [1] "Ratchet iteration  1 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration  2 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration  3 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration  4 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration  5 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration  6 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration  7 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration  8 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration  9 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration  10 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration  11 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration  12 , best pscore so far: -7164.92050394874"
## [1] "Ratchet iteration  13 , best pscore so far: -7164.92050394874"
## [1] "Ratchet iteration  14 , best pscore so far: -7164.92050383717"
## [1] "Ratchet iteration  15 , best pscore so far: -7164.92050383717"
## [1] "Ratchet iteration  16 , best pscore so far: -7164.92050383717"
## [1] "Ratchet iteration  17 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  18 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  19 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  20 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  21 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  22 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  23 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  24 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  25 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  26 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  27 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  28 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  29 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  30 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  31 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  32 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  33 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration  34 , best pscore so far: -7164.920503434"
## [1] "Ratchet iteration  35 , best pscore so far: -7164.920503434"
## [1] "Ratchet iteration  36 , best pscore so far: -7164.920503434"
## [1] "Ratchet iteration  37 , best pscore so far: -7164.92049494019"
## [1] "Ratchet iteration  38 , best pscore so far: -7164.92049494019"
## [1] "Ratchet iteration  39 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  40 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  41 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  42 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  43 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  44 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  45 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  46 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  47 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  48 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  49 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  50 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  51 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  52 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  53 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  54 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  55 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  56 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  57 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  58 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration  59 , best pscore so far: -7164.92049435348"
## optimize edge weights:  -7164.92 --> -7164.92 
## optimize topology:  -7164.92 --> -7164.92 
## 0 
## optimize edge weights:  -7164.92 --> -7164.92 
## 'log Lik.' -7164.92 (df=397)

note that the object returned by optim.pml is not a phylogeny, but an optimized object of class “pml” (which contains an optimized phylogeny with edge lengths). Let’s plot this phylogeny:

# extract the optimized phylogeny (with edge lengths) from the pml fit
tree_JC <- fitJC$tree
plot(tree_JC, type = "phylogram", lab4ut = "horizontal", edge.width = 2, show.tip.label = FALSE)

3 infer ancestral sequences

3.1 reconstruct ancestral sequences

# what tree do we want to do ancestral sequence reconstruction for?
tree_type <- "NJ"
tree <- tree_NJ

# convert alignment format (phangorn's phyDat)
align_npd <- phyDat(align_n)

# parsimony reconstruction: based on the Fitch algorithm for bifurcating trees,
# with ACCTRAN state assignment (note: there will often be no unique solution)
anc <- ancestral.pars(tree, align_npd, "ACCTRAN")
plotAnc(tree, anc, cex.pie = .5, show.tip.label = FALSE)
title(paste0(tree_type, ": ACCTRAN"))

3.2 match ancestral sequences with known strains

align known influenza virus with ancestral sequences; known strains must have a date preceding their child strains; we assume there are at least 2 layers in a tree

# get sequences ready (combine ancestral with known strains)
seqa_dir <- paste0(result_dir, "/FASTA_anc.fa")
# for each ancestral node, take the most probable base at every site
sequ_anc <- as.DNAbin(lapply(anc, function(x) colnames(x)[apply(x, 1, which.max)]))
write.dna(sequ_anc, file = seqa_dir, format = "fasta")

sequ <- read.dna(seq_dir, format = "fasta")
sequ_anc <- read.dna(seqa_dir, format = "fasta", as.matrix = FALSE)

seq_all_dir <- paste0(result_dir, "/FASTA_all.fa")
sequ_all <- append(sequ_anc, sequ)
# drop ALL copies of duplicated names: leaf sequences are not included in the
# distance calculation
sequ_all <- sequ_all[!(duplicated(names(sequ_all)) | duplicated(names(sequ_all), fromLast = TRUE))]
write.dna(sequ_all, file = seq_all_dir, format = "fasta")

# align sequences; install mafft @ (https://mafft.cbrc.jp/alignment/software/)
align_anc_dir <- paste0(result_dir, "/alignment_anc.fa")
if (!file.exists(align_anc_dir)) system(paste0("mafft ", seq_all_dir, " > ", align_anc_dir))
align_anc <- read.dna(align_anc_dir, format = "fasta")

calculate distance (ancestral sequences x known sequences) from alignments

# get distances ready (cached to disk; delete dm_anc.Rdata to force recomputation)
dm_anc_dir <- paste0(result_dir, "/dm_anc.Rdata")
if (!file.exists(dm_anc_dir)) {
  dm_anc_ <- dist.ml(phyDat(align_anc), model = "JC69")
  # as.matrix.dist takes no sparse argument; the original sparse=T was ignored
  dm_anc <- as.matrix(dm_anc_)
  # keep rows = ancestral nodes only, columns = known strains only
  dm_anc <- dm_anc[!rownames(dm_anc) %in% strains_id, colnames(dm_anc) %in% strains_id]
  save(dm_anc, file = dm_anc_dir)
}
dm_anc <- get(load(dm_anc_dir))  # load() returns the saved object's name; get() fetches it
strains_anc <- sapply(strsplit(colnames(dm_anc), "[-]"), function(x) x[1])
dates_anc <- as.Date(sapply(strsplit(colnames(dm_anc), "[-]"), function(x) x[2]))
names(dates_anc) <- strains_anc

label tree nodes

evo_month <- 444 # max number of months needed for a parent strain to evolve into a child strain (max = 444)
# NOTE(review): evo_month appears unused - the search window below is hard-coded to 6 months; confirm

# load metadata; the first column is a redundant row index, hence [, -1]
meta <- read.csv(meta_dir, stringsAsFactors = FALSE)[, -1]
rownames(meta) <- meta[, "strain"]

# label nodes starting from root using depth first search
# sorting by child id puts the nseq leaf nodes (ids 1..nseq) in the first rows
edges <- tree$edge[order(tree$edge[, 2]), ]
# replacing leaf ids with tip labels coerces the whole edge matrix to character
edges[seq_len(nseq), 2] <- tree$tip.label
edges[, 2] <- sapply(strsplit(edges[, 2], "[-]"), function(x) x[1])

tree_labelled <- TRUE
nodes <- unique(as.vector(edges))
node <- unique(edges[!edges[, 1] %in% edges[, 2], 1]) # root: the parent that is never a child
# depth-first walk: an internal node is labelled once all of its children are labelled
while (sum(!nodes %in% strains) > 0) {
  children <- sapply(strsplit(edges[edges[, 1] == node, 2], "[-]"), function(x) x[1])
  if (any(!children %in% strains)) {
    # descend into the first still-unlabelled child
    node <- children[!children %in% strains]
    node <- node[1]
  } else {
    # all children labelled: pick the known strain closest (by sequence distance)
    # to this ancestor, dated within the 6 months before the earliest child strain
    min_date <- min(as.Date(as.character(meta[children, "date"])))
    min_date_ <- seq(min_date, length = 2, by = "-6 months")[2]
    dm_colind <- which(dates_anc < min_date & dates_anc > min_date_ &
                         !names(dates_anc) %in% edges)
    if (length(dm_colind) == 0) {
      print("incomplete! need data on older strains")
      tree_labelled <- FALSE
      break
    }
    # relabel this node everywhere with the chosen strain's accession
    edges[edges == node] <- nodes[nodes == node] <- node_strain <-
      names(dm_colind[which.min(dm_anc[node, dm_colind])])
    node <- edges[edges[, 2] == node_strain, 1] # move back up to the parent
  }
}

3.3 plot

prepare nodelist virus & edgelist vphy

# viral strains (node list for the map)
virus <- meta[nodes, c("strain", "subtype", "date", "lng", "lat")]

# viral phylogeny (edge list: one segment per parent -> child edge)
vphy <- data.frame(id = seq_len(nrow(edges)),
                   start_lng = meta[edges[, 1], "lng"], start_lat = meta[edges[, 1], "lat"],
                   end_lng = meta[edges[, 2], "lng"], end_lat = meta[edges[, 2], "lat"])

plot map

edges are coloured based on the child strain; red -> yellow (ancestor -> new strains)

# heat colours: red (early) to yellow (recent); strip the trailing "FF" alpha
# channel so the values are plain "#RRGGBB" strings (gsub is vectorized, so the
# original per-element sapply was unnecessary)
colours <- sub("FF$", "", heat.colors(length(unique(virus$date))))
names(colours) <- sort(unique(virus$date))

# phylogeny edges as great-circle arcs between parent and child locations
geo_lines <- gcIntermediate(vphy[, c("start_lng", "start_lat")],
                            vphy[, c("end_lng", "end_lat")],
                            n = 200, addStartEnd = TRUE, sp = TRUE, breakAtDateLine = TRUE)

# interactive map: strains drawn as circles, phylogeny edges as coloured arcs
leaflet() %>% 
  addMiniMap(tiles = providers$Esri.OceanBasemap, width = 120, height=80) %>%  # small overview inset
  addProviderTiles(providers$Esri.OceanBasemap) %>%  # base map tiles
  addCircleMarkers(data=virus, lng=~lng, lat=~lat,
                   color = as.vector(colours[virus[,"date"]]), weight=2, radius=2,
                   label=paste(virus[,"subtype"], " (", 
                               virus[,"strain"], ") ", 
                               virus[,"date"], sep="")) %>%  # hover label: subtype (strain) date
  addPolylines(data=geo_lines, color = as.vector(colours[meta[edges[,2],"date"]]),  # edge colour = child strain's date
               weight = 1, opacity = 0.5, fill = FALSE, fillOpacity = 0.5, label = NULL)

keep in mind that our data set contains mostly strains found in the USA, therefore most of the strains on our phylogeny will be in the USA; this could be because the USA has the most flu strains, or, more practically, simply because the USA collects the most data!

let’s see where the root strain comes from

# display root and leaf nodes
# root = the only node that appears as a parent but never as a child
meta[unique(edges[!edges[,1]%in%edges[,2],1]),] # root strain
# meta[unique(edges[!edges[,2]%in%edges[,1],2]),] # leaf strains (uncomment to view)